Since I'd like to spend some time later sharing my experience reading and implementing the Transformer, this post doesn't cover the Transformer part, but the music it generated is still attached below for everyone.
Picking up from yesterday, let's start writing!
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Input, Dense, Flatten, Reshape, Dropout, Activation,
                                     Conv1D, BatchNormalization, LeakyReLU, ReLU,
                                     Bidirectional, LSTM, RepeatVector, TimeDistributed)

def build_discriminator():
    model = Sequential([
        Conv1D(32, 3, strides=2, input_shape=(SEQ_LEN, 1)),
        LeakyReLU(),
        Conv1D(64, 3, strides=2, padding='same', use_bias=False),
        BatchNormalization(),
        LeakyReLU(),
        Conv1D(64, 3, strides=2, padding='same', use_bias=False),
        BatchNormalization(),
        LeakyReLU(),
        Flatten(),
        Dense(1)
    ])
    return model
def build_generator():
    model = Sequential([
        Bidirectional(LSTM(128, return_sequences=True), input_shape=(gen_len, gen_len)),
        LeakyReLU(alpha=0.2),
        Bidirectional(LSTM(128, return_sequences=True)),
        LeakyReLU(alpha=0.2),
        Bidirectional(LSTM(128)),
        LeakyReLU(alpha=0.2),
        # specifying output to have 40 timesteps
        RepeatVector(seq_len),
        # specifying 1 feature as the output
        Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)),
        LeakyReLU(alpha=0.2),
        Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)),
        LeakyReLU(alpha=0.2),
        Bidirectional(LSTM(128, return_sequences=True, dropout=0.2)),
        LeakyReLU(alpha=0.2),
        Dropout(0.3),
        TimeDistributed(Dense(128)),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        TimeDistributed(Dense(128)),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        TimeDistributed(Dense(1)),
        # back to 0 ~ 1
        Activation("sigmoid"),
    ])
    noise = Input(shape=(gen_len, gen_len))
    img = model(noise)
    return Model(noise, img)
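As a quick sanity check of this encoder/decoder shape flow (noise of shape (gen_len, gen_len) goes in, seq_len timesteps of a single 0 ~ 1 value come out), something like the following can be run. This is a sketch I added for illustration; gen_len and seq_len are assumed to be defined earlier in the series.
# hypothetical shape check, not part of the original post
lstm_gen = build_generator()
z = tf.random.normal([4, gen_len, gen_len])   # a batch of 4 noise "images"
print(lstm_gen(z).shape)                      # expected: (4, seq_len, 1)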
def build_discriminator():
    model = Sequential([
        Bidirectional(LSTM(128, return_sequences=True), input_shape=(seq_len, 1)),
        Activation("relu"),
        LeakyReLU(alpha=0.2),
        Bidirectional(LSTM(128)),
        Activation("relu"),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        RepeatVector(1),
        TimeDistributed(Dense(128, activation='sigmoid')),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        TimeDistributed(Dense(128, activation='relu')),
        LeakyReLU(alpha=0.2),
        Dropout(0.4),
        TimeDistributed(Dense(1, activation='linear'))
    ])
    img = Input(shape=(seq_len, 1))
    validity = model(img)
    return Model(img, validity)
def WaveGANGenerator():
    model = tf.keras.Sequential([
        Dense(seq_len, activation='relu', input_shape=(seq_len,)),
        Reshape((1, seq_len)),
        Conv1D(64, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),
        Conv1D(128, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),
        Conv1D(seq_len, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),
        Conv1D(seq_len, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),
        Flatten(),
        Dense(seq_len, activation='sigmoid')
    ])
    return model
def WaveGANDiscriminator():
    model = tf.keras.Sequential([
        Dense(seq_len, activation='relu', input_shape=(seq_len,)),
        Reshape((1, seq_len)),
        Conv1D(64, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),
        Conv1D(seq_len, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),
        Conv1D(seq_len, kernel_size=25, strides=4, padding='same'),
        BatchNormalization(momentum=0.8),
        ReLU(),
        Flatten(),
        Dense(seq_len, activation='sigmoid')
    ])
    return model
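Before wiring up the losses, it may help to confirm what shapes this WaveGAN-style pair actually produces. The sketch below is something I added for illustration (it again assumes seq_len is defined, e.g. seq_len = 40):
# hypothetical shape check, not part of the original post
wg_gen = WaveGANGenerator()
wg_disc = WaveGANDiscriminator()
z = tf.random.normal([4, seq_len])   # a batch of 4 noise vectors
fake = wg_gen(z)                     # -> (4, seq_len), values in 0 ~ 1 (sigmoid)
score = wg_disc(fake)                # -> (4, seq_len)
print(fake.shape, score.shape)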
# from_logits=True expects raw (un-squashed) discriminator outputs;
# if the discriminator's last layer already applies a sigmoid, use from_logits=False instead
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def generator_loss(fake_output):
    # the generator wants the discriminator to label its fake samples as real (1)
    return cross_entropy(tf.ones_like(fake_output), fake_output)

def discriminator_loss(real_output, fake_output):
    # real samples should be classified as 1, generated samples as 0
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss
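The training loop below refers to generator, discriminator, generator_optimizer and discriminator_optimizer, which aren't defined in this post. A minimal sketch of how they might be set up (pairing the WaveGAN models and using Adam with a 1e-4 learning rate are my assumptions, not something fixed by the post):
# assumed setup; swap in build_generator()/build_discriminator() for the BiLSTM version
generator = WaveGANGenerator()
discriminator = WaveGANDiscriminator()
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)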
import time
from IPython.display import clear_output

total_Gloss = []
total_Dloss = []

def train(dataset, epochs):
    for epoch in range(epochs):
        start = time.time()
        G_loss = 0
        D_loss = 0
        for i, image_batch in enumerate(dataset):
            # see train_step below
            gen_loss, disc_loss = train_step(image_batch)
            print(f"Step:{i} | G_loss:{gen_loss} D_loss:{disc_loss}|")
            G_loss += gen_loss
            D_loss += disc_loss
        clear_output(wait=True)
        print(f'Time for epoch {epoch + 1} is {time.time()-start} sec\n')
        print(f'G_AVE_Loss:{G_loss/len(dataset)}')
        print(f'D_AVE_loss:{D_loss/len(dataset)}')
        total_Gloss.append(G_loss/len(dataset))
        total_Dloss.append(D_loss/len(dataset))
@tf.function
def train_step(music):
    LAMBDA = 10  # not used in this training step
    noise = tf.random.normal([BATCH_SIZE, seq_len])
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_music = generator(noise, training=True)
        real_output = discriminator(music, training=True)
        fake_output = discriminator(generated_music, training=True)
        gen_loss = generator_loss(fake_output)
        # note the argument order: (real_output, fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss
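For reference, here is one way the training could actually be kicked off and the recorded losses inspected afterwards. This is a sketch under my own assumptions: train_data is a NumPy array of note sequences already scaled to 0 ~ 1 with shape (num_samples, seq_len), and the BATCH_SIZE/EPOCHS values are placeholders.
# hypothetical driver code, not from the original post
BATCH_SIZE = 32
EPOCHS = 100

dataset = (tf.data.Dataset.from_tensor_slices(train_data.astype('float32'))
           .shuffle(1024)
           .batch(BATCH_SIZE, drop_remainder=True))

train(dataset, EPOCHS)

# plot the per-epoch average losses collected in total_Gloss / total_Dloss
import matplotlib.pyplot as plt
plt.plot(total_Gloss, label='Generator loss')
plt.plot(total_Dloss, label='Discriminator loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()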
import random
import numpy as np
from mido import MidiFile, MidiTrack, Message

noise = np.random.normal(0, 1, (1, seq_len))
predict = generator.predict(noise)
# the generator outputs values in 0 ~ 1, so scale back up to MIDI note numbers (0 ~ 127)
predict = predict * 127

midler = MidiFile()
track = MidiTrack()
midler.tracks.append(track)
track.append(Message('program_change', program=2, time=0))
for x in range(seq_len):
    # As mentioned earlier, only the note sequence was trained;
    # timing and control changes were not, so they are generated randomly here.
    on_interval = random.randint(0, 127)
    off_interval = random.randint(0, 127)
    change_interval = random.randint(0, 127)
    change_value = random.randint(0, 127)
    isControl = random.randint(0, 1)
    track.append(Message('note_on', channel=1, note=int(predict[0][x]), velocity=64, time=on_interval))
    if isControl:
        track.append(Message('control_change', channel=1, control=64, value=change_value, time=change_interval))
    track.append(Message('note_off', channel=1, note=int(predict[0][x]), velocity=64, time=off_interval))
midler.save('WaveGan.mid')
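To double-check what was written out, the saved file can be reloaded with mido and skimmed; this is an optional check I added, not part of the original post:
# reload the generated file and peek at the first few messages
mid = MidiFile('WaveGan.mid')
print(f'duration: {mid.length:.1f} s, messages: {len(mid.tracks[0])}')
for msg in mid.tracks[0][:5]:
    print(msg)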
Even though timing and control changes weren't used during training and we only trained the arrangement of the notes, the result still sounds noticeably different from something generated purely at random. The criteria for judging music are still pretty fuzzy (at least to a muggle like me who doesn't understand music), so it's best to actually listen for yourself to appreciate it, which is why I'm attaching the result I generated at the end XD.